Background

Today we’ll be looking at data on the presence and absence of the short-finned eel (Anguilla australis) at a number of sites in New Zealand. These data come from Elith, J., Leathwick, J.R., and Hastie, T., 2009. A working guide to boosted regression trees. Journal of Animal Ecology 77: 802-81.

Codebook:

load(url("http://www.stat.duke.edu/~cr173/Sta444_Sp17/data/anguilla.Rdata"))

set.seed(20170130)

part = resample_partition(anguilla, c(train=0.75, test=0.25))

anguilla = as.data.frame(part$train)
anguilla_test = as.data.frame(part$test)

EDA

Simple GLM

g = glm(presence ~ SegSumT, data=anguilla, family=binomial)
summary(g)
## 
## Call:
## glm(formula = presence ~ SegSumT, family = binomial, data = anguilla)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4619  -0.6642  -0.3822  -0.1451   2.7510  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -14.78289    1.54087  -9.594   <2e-16 ***
## SegSumT       0.78727    0.08753   8.995   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 630.04  on 616  degrees of freedom
## Residual deviance: 509.31  on 615  degrees of freedom
## AIC: 513.31
## 
## Number of Fisher Scoring iterations: 5
inv_logit = function(x) exp(x)/(1+exp(x))

pred = data_frame(
  SegSumT = seq(10, 25, length.out = 1000)
) %>%
  add_predictions(g) %>%
  mutate(pred = inv_logit(pred))

ggplot(anguilla, aes(x=SegSumT, y=presence)) +
  #geom_point() +
  geom_jitter(height=0.05, alpha=0.33) + 
  geom_line(data=pred, aes(y=pred), col='red')

Residuals

d_g = anguilla %>% 
  add_predictions(g, var="p_hat") %>%
  mutate(p_hat = inv_logit(p_hat)) %>%
  mutate(resid = presence - p_hat) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025))

ggplot(d_g, aes(x=p_hat, y=resid)) +
    geom_point()

d_g %>% 
  group_by(p_hat_bin) %>%
  summarize(resid_mean = mean(resid), resid_sd = sd(resid)) %>%
  ggplot(aes(x=p_hat_bin, y=resid_mean)) +
    geom_point()

Pearson Residuals

\[ r_i = \frac{Y_i - E(Y_i)}{Var(Y_i)} = \frac{Y_i - \hat{p}_i}{\hat{p}_i(1-\hat{p}_i)} \]

d_g = d_g %>%
  mutate(pearson = (presence - p_hat)/(p_hat*(1-p_hat)))

ggplot(d_g, aes(x=p_hat, y=pearson)) +
  geom_point()

d_g %>% 
  group_by(p_hat_bin) %>%
  summarize(pearson_mean = mean(pearson), resid_sd = sd(pearson)) %>%
  ggplot(aes(x=p_hat_bin, y=pearson_mean)) +
    geom_point()

Deviance Residuals

\[ d_i = \text{sign}(Y_i-\hat{p_i}) \sqrt{ -2 \left(Y_i \log \hat{p}_i+(1-Y_i)\log (Y_i - \hat{p}_i) \right) } \]

d_g = d_g %>%
  mutate(deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat))))

ggplot(d_g, aes(x=p_hat, y=deviance)) +
  geom_point()

d_g %>% 
  group_by(p_hat_bin) %>%
  summarize(deviance_mean = mean(deviance), deviance_sd = sd(deviance)) %>%
  ggplot(aes(x=p_hat_bin, y=deviance_mean)) +
    geom_point()

g
## 
## Call:  glm(formula = presence ~ SegSumT, family = binomial, data = anguilla)
## 
## Coefficients:
## (Intercept)      SegSumT  
##    -14.7829       0.7873  
## 
## Degrees of Freedom: 616 Total (i.e. Null);  615 Residual
## Null Deviance:       630 
## Residual Deviance: 509.3     AIC: 513.3
sum(d_g$deviance^2)
## [1] 509.305

Shiny residuals

shinyApp(
  ui = mainPanel(
    plotOutput("plot"),
    selectInput("var", label = "Variable", choices = c("resid","deviance","pearson")),
    sliderInput("bin_width", label="Bin width", min = 0, max=0.2, step = 0.005, value=0.025)
  ),
  server = function(input, output)
  {
    output$plot = renderPlot(
      d_g %>% 
        mutate(p_hat_bin = p_hat - (p_hat %% input$bin_width)) %>%
        group_by(p_hat_bin) %>%
        summarize_(mean = paste0('mean(', input$var, ')')) %>%
        ggplot(aes(x=p_hat_bin, y=mean)) +
          geom_point()
    )
  }
)

Full Model

f = glm(presence~., data=anguilla, family=binomial)
summary(f)
## 
## Call:
## glm(formula = presence ~ ., family = binomial, data = anguilla)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.15801  -0.56129  -0.28822  -0.08002   2.85046  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.869669   1.771435  -5.007 5.53e-07 ***
## SegSumT        0.623457   0.097058   6.424 1.33e-10 ***
## DSDist        -0.002802   0.002106  -1.330   0.1834    
## DSMaxSlope    -0.129213   0.068263  -1.893   0.0584 .  
## USRainDays    -0.525662   0.214254  -2.453   0.0141 *  
## USSlope       -0.051561   0.025355  -2.034   0.0420 *  
## USNative      -0.468314   0.463225  -1.011   0.3120    
## DSDam         -0.878588   0.508636  -1.727   0.0841 .  
## Methodmixture -0.066323   0.510633  -0.130   0.8967    
## Methodnet     -1.162441   0.505540  -2.299   0.0215 *  
## Methodspo     -1.650816   0.827089  -1.996   0.0459 *  
## Methodtrap    -2.953097   0.686260  -4.303 1.68e-05 ***
## LocSed        -0.213485   0.099421  -2.147   0.0318 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 630.04  on 616  degrees of freedom
## Residual deviance: 434.44  on 604  degrees of freedom
## AIC: 460.44
## 
## Number of Fisher Scoring iterations: 6

Residuals vs fitted

d_f = anguilla %>%
  add_predictions(f, var="p_hat") %>%
  mutate(p_hat = inv_logit(p_hat)) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
  mutate(
    resid = presence - p_hat,
    pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
    deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
  )

resid_f = d_f %>%
  select(p_hat:deviance) %>%
  gather(type, value, -p_hat, -p_hat_bin) %>%
  mutate(type = factor(type, levels=c("resid","pearson","deviance"))) 

ggplot(resid_f, aes(x=p_hat, y=value, color=type)) +
  geom_point(alpha=0.1) +
  facet_wrap(~type, ncol=3, scale="free_y")

resid_f %>%
  group_by(type, p_hat_bin) %>%
  summarize(mean = mean(value)) %>%
  ggplot(aes(x=p_hat_bin, y=mean, color=type)) +
    geom_point() +
    facet_wrap(~type, ncol=3, scale="free_y")

Residuals vs predictors

ggplot(d_f, aes(x=SegSumT, y=deviance, color=as.factor(presence))) + geom_point()

ggplot(d_f, aes(x=DSMaxSlope, y=deviance, color=as.factor(presence))) + geom_point()

Possible fix:

f2 = glm(presence~.+I(SegSumT^2), data=anguilla, family=binomial)
summary(f2)
## 
## Call:
## glm(formula = presence ~ . + I(SegSumT^2), family = binomial, 
##     data = anguilla)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.09241  -0.57666  -0.26048  -0.02316   2.99017  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -63.614787  22.585115  -2.817  0.00485 ** 
## SegSumT         7.175469   2.666667   2.691  0.00713 ** 
## DSDist         -0.003869   0.002158  -1.793  0.07296 .  
## DSMaxSlope     -0.128906   0.067778  -1.902  0.05719 .  
## USRainDays     -0.632217   0.223784  -2.825  0.00473 ** 
## USSlope        -0.066911   0.026264  -2.548  0.01084 *  
## USNative       -0.283431   0.471036  -0.602  0.54736    
## DSDam          -1.015057   0.514068  -1.975  0.04832 *  
## Methodmixture   0.081875   0.509820   0.161  0.87241    
## Methodnet      -1.137531   0.502683  -2.263  0.02364 *  
## Methodspo      -1.537345   0.806835  -1.905  0.05673 .  
## Methodtrap     -2.895911   0.686317  -4.219 2.45e-05 ***
## LocSed         -0.233322   0.099856  -2.337  0.01946 *  
## I(SegSumT^2)   -0.193490   0.078021  -2.480  0.01314 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 630.04  on 616  degrees of freedom
## Residual deviance: 426.80  on 603  degrees of freedom
## AIC: 454.8
## 
## Number of Fisher Scoring iterations: 7
d_f2 = anguilla %>%
  add_predictions(f2, var="p_hat") %>%
  mutate(p_hat = inv_logit(p_hat)) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
  mutate(
    resid = presence - p_hat,
    pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
    deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
  )

rbind(
  cbind(d_f,  model="linearT"),
  cbind(d_f2, model="quadT")
) %>%
  ggplot(aes(x=SegSumT, y=deviance, color=as.factor(presence))) + geom_point() + facet_wrap(~model)

d_f %>% 
  select(deviance, SegSumT:LocSed) %>%
  mutate(DSDam = factor(DSDam)) %>%
  ggpairs(
    lower = list(continuous = "cor"),
    upper = list(continuous = wrap("points", alpha = 0.05))
  )

Gradient Boosting

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
y = anguilla$presence %>% as.integer()
x = model.matrix(presence~.-1, data=anguilla)

xg = xgboost(data=x, label=y, nthead=4, nround=30, objective="binary:logistic")
## [1]  train-error:0.100486 
## [2]  train-error:0.092382 
## [3]  train-error:0.079417 
## [4]  train-error:0.079417 
## [5]  train-error:0.069692 
## [6]  train-error:0.061588 
## [7]  train-error:0.055105 
## [8]  train-error:0.045381 
## [9]  train-error:0.040519 
## [10] train-error:0.043760 
## [11] train-error:0.038898 
## [12] train-error:0.035656 
## [13] train-error:0.029173 
## [14] train-error:0.027553 
## [15] train-error:0.024311 
## [16] train-error:0.024311 
## [17] train-error:0.019449 
## [18] train-error:0.012966 
## [19] train-error:0.009724 
## [20] train-error:0.009724 
## [21] train-error:0.011345 
## [22] train-error:0.008104 
## [23] train-error:0.004862 
## [24] train-error:0.001621 
## [25] train-error:0.001621 
## [26] train-error:0.001621 
## [27] train-error:0.001621 
## [28] train-error:0.000000 
## [29] train-error:0.000000 
## [30] train-error:0.000000
pred_xg = 

d_xg = anguilla %>%
  mutate(p_hat = predict(xg, newdata=x)) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
  mutate(
    resid = presence - p_hat,
    pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
    deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
  )

resid_xg = d_xg %>%
  select(p_hat:deviance) %>%
  gather(type, value, -p_hat, -p_hat_bin) %>%
  mutate(type = factor(type, levels=c("resid","pearson","deviance"))) 

ggplot(resid_xg, aes(x=p_hat, y=value, color=type)) +
  geom_point(alpha=0.1) +
  facet_wrap(~type, ncol=3, scale="free_y")

resid_f %>%
  group_by(type, p_hat_bin) %>%
  summarize(mean = mean(value)) %>%
  ggplot(aes(x=p_hat_bin, y=mean, color=type)) +
    geom_point() +
    facet_wrap(~type, ncol=3, scale="free_y")

Predictive Performance

grid.arrange(
  ggplot(d_g, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
    geom_jitter(height=0.2, alpha=0.5) + labs(title="simple"),
  
  ggplot(d_f, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
    geom_jitter(height=0.2, alpha=0.5) + labs(title="full"),
  
  ggplot(d_xg, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
    geom_jitter(height=0.2, alpha=0.5) + labs(title="xgboost")
)

score = function(p_hat, label, cutoff)
{
  map_df(
    cutoff, 
    ~ data.frame(
      sensitivity = sum(p_hat >= . &  label) / length(p_hat),
      specificity = sum(p_hat <  . & !label) / length(p_hat)
    ) 
  )
}

score(d_f$p_hat, d_f$presence, seq(0,1,by=0.1)) %>%
  ggplot(aes(x=specificity, y=sensitivity)) +
  geom_point() + 
  geom_line()

score(d_f$p_hat, d_f$presence, seq(0,1,by=0.01)) %>%
  ggplot(aes(x=1-specificity, y=sensitivity)) +
  #geom_point() + 
  geom_line()

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
roc = function(d) 
  prediction(d$p_hat, d$presence) %>% performance(measure = "tpr", x.measure = "fpr") 

auc = function(d) 
  {prediction(d$p_hat, d$presence) %>% performance(measure = "auc")}@y.values[[1]]

perf_g  = roc(d_g)
perf_f  = roc(d_f)
perf_xg = roc(d_xg)

plot(perf_g, col="#7fc97f", lwd=2)
plot(perf_f, col="#beaed4",  lwd=2, add=TRUE)
plot(perf_xg, col="#fdc086",  lwd=2, add=TRUE)
abline(a=0, b = 1, col="grey")

auc(d_g)
## [1] 0.7993274
auc(d_f)
## [1] 0.8694881
auc(d_xg)
## [1] 1

Out of sample ROC

d_f_test = anguilla_test %>%
  add_predictions(f, var="p_hat") %>%
  mutate(p_hat = inv_logit(p_hat)) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
  mutate(
    resid = presence - p_hat,
    pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
    deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
  )


d_xg_test = anguilla_test %>%
  mutate(p_hat = predict(xg, newdata=model.matrix(presence~.-1, data=anguilla_test))) %>%
  mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
  mutate(
    resid = presence - p_hat,
    pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
    deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
  )
grid.arrange(
  ggplot(d_f_test, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
    geom_jitter(height=0.2, alpha=0.5) + labs(title="full"),
  
  ggplot(d_xg_test, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
    geom_jitter(height=0.2, alpha=0.5) + labs(title="xgboost")
)

plot(roc(d_f_test), col="#beaed4",  lwd=2)
plot(roc(d_xg_test), col="#fdc086",  lwd=2, add=TRUE)
abline(a=0, b = 1, col="grey")

auc(d_f_test)
## [1] 0.8574315
auc(d_xg_test)
## [1] 0.8848485